# Load project helpers (custom plot themes etc.) and set up a
# reproducible simulated example.
source('misc_functions/functions.R')
set.seed(123)
library(mgcv)

# Simulate data from gamSim example 1 (the Gu & Wahba 4-term additive
# model), then fit a plain linear model that ignores the nonlinearities.
# Use TRUE/FALSE (never T/F, which are reassignable) and <- for assignment.
dat <- gamSim(1, n = 400, dist = "normal", scale = 1, verbose = FALSE)
mod <- lm(y ~ x1 + x2, data = dat)
summary(mod)
Call:
lm(formula = y ~ x1 + x2, data = dat)
Residuals:
Min 1Q Median 3Q Max
-8.6634 -1.3220 -0.0571 1.5297 7.0937
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.2745 0.3332 21.83 <2e-16 ***
x1 6.3616 0.4352 14.62 <2e-16 ***
x2 -5.1214 0.4526 -11.31 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.532 on 397 degrees of freedom
Multiple R-squared: 0.4594, Adjusted R-squared: 0.4567
F-statistic: 168.7 on 2 and 397 DF, p-value: < 2.2e-16
# Base diagnostic plots: residuals vs. fitted and normal Q-Q.
plot(mod, which = 1:2)

# Fitted vs. observed with a loess overlay.
# Requires the broom, glue, and ggplot2 packages; the %>% pipe is
# presumably attached via functions.R — TODO confirm.
broom::augment(mod) %>%
  ggplot(aes(x = .fitted, y = y)) +
  geom_point(alpha = .25, color = '#ff5500') +
  geom_smooth(se = FALSE, color = '#00aaff') +
  annotate('text',
           label = glue::glue("Rsq = {round(summary(mod)$r.squared, 2)}"),
           x = 4,
           y = 16) +
  labs(title = 'Fitted vs. Observed') +
  theme_minimal()
# Each predictor against the response, one facet per predictor.
# pivot_longer() replaces the superseded gather(); the empty aes() in
# geom_smooth() was a no-op and is dropped.
dat %>%
  select(x1, x2, y) %>%
  pivot_longer(cols = -y, names_to = 'variable', values_to = 'Predictor') %>%
  ggplot(aes(x = Predictor, y = y)) +
  geom_point(alpha = .25, color = '#ff5500') +
  geom_smooth(color = '#00aaff', se = FALSE) +
  facet_grid(~variable) +
  labs(title = 'Predictors vs. Y') +
  theme_trueMinimal()  # custom theme from functions.R
# Refit with the response on the log scale (gamSim's y is positive here);
# <- for assignment and a named data argument for clarity.
modlog <- lm(log(y) ~ x1 + x2, data = dat)
summary(modlog)
Call:
lm(formula = log(y) ~ x1 + x2, data = dat)
Residuals:
Min 1Q Median 3Q Max
-3.07226 -0.13090 0.05803 0.22780 0.79379
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.82673 0.05398 33.839 <2e-16 ***
x1 0.93691 0.07050 13.290 <2e-16 ***
x2 -0.68719 0.07333 -9.371 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4102 on 397 degrees of freedom
Multiple R-squared: 0.3968, Adjusted R-squared: 0.3937
F-statistic: 130.6 on 2 and 397 DF, p-value: < 2.2e-16
# Diagnostics for the log-scale model.
plot(modlog, which = 1:2)

# Fitted vs. observed for the log model. BUG FIX: the original augmented
# `mod` (the untransformed model) while annotating modlog's R-squared, so
# the points and the annotation described different fits. Augment modlog
# instead; broom names its response column `log(y)`, hence the backticks.
# NOTE(review): annotation coordinates moved onto the log scale — confirm
# placement against the actual plot.
broom::augment(modlog) %>%
  ggplot(aes(x = .fitted, y = `log(y)`)) +
  geom_point(alpha = .25, color = '#ff5500') +
  geom_smooth(se = FALSE, color = '#00aaff') +
  annotate('text',
           label = glue::glue("Rsq = {round(summary(modlog)$r.squared, 2)}"),
           x = 1,
           y = 3) +
  labs(title = 'Fitted vs. Observed') +
  theme_trueMinimal()
# Simulate a clearly nonlinear relationship: a sine wave plus a sharp
# Gaussian bump centered at x = .5, observed with N(0, .3^2) noise.
set.seed(123)
x <- runif(500)
mu <- sin(2 * (4 * x - 2)) + 2 * exp(-(16 ^ 2) * ((x - .5) ^ 2))
y <- rnorm(500, mu, .3)
d <- data.frame(x, y)
A standard linear regression is definitely not going to capture this relationship. As above, we could try to use polynomial regression here, e.g. fitting a quadratic or cubic function within the standard regression framework. However, this is unrealistic at best, and at worst isn’t useful for complex relationships. In the following, even with a polynomial of degree 15 the fit is fairly poor in many areas, and it ‘wiggles’ in some places where there doesn’t appear to be a need to.
# Fit polynomial regressions of increasing degree (3, 6, 9, 12, 15) and
# gather the fitted values into long format for plotting.
# vapply() replaces sapply() for a type-stable (500 x 5) result;
# pivot_longer() replaces the superseded gather().
library(plotly) # install if you don't have

fits <- vapply(seq(3, 15, 3),
               function(p) fitted(lm(y ~ poly(x, p))),
               numeric(length(y))) %>%
  data.frame(x, y, .) %>%
  pivot_longer(cols = -c(x, y), names_to = 'polynomial', values_to = 'fits') %>%
  mutate(polynomial = factor(polynomial, labels = seq(3, 15, 3)))

plot_ly(data = d) %>%
  add_markers(~x, ~y, marker = list(color = '#ff5500', opacity = .2),
              showlegend = FALSE) %>%
  add_lines(~x, ~fits, color = ~polynomial, data = fits) %>%
  theme_plotly()  # custom theme from functions.R
NA